
Load the tweets and check that they loaded correctly. We also inspect summary(tweets) for a first interpretation (see the note below the output):

# Set working directory
# getwd()
# setwd("./data/")

# Load data
load("../data/Tweets_all.rda")

# Check that tweets are loaded
head(tweets)
## # A tibble: 6 × 14
##   created_at               id id_str            full_text in_reply_to_screen_n…¹
##   <dttm>                <dbl> <chr>             <chr>     <chr>                 
## 1 2023-01-20 17:17:32 1.62e18 1616469988369469… "Im MSc … <NA>                  
## 2 2023-01-13 07:52:01 1.61e18 1613790954737074… "Was bew… <NA>                  
## 3 2023-01-12 19:30:01 1.61e18 1613604227141537… "Was uns… <NA>                  
## 4 2023-01-12 08:23:00 1.61e18 1613436367169634… "Eine di… <NA>                  
## 5 2023-01-11 14:00:05 1.61e18 1613158809081450… "Wir gra… <NA>                  
## 6 2023-01-10 17:06:11 1.61e18 1612843252083834… "Unsere … <NA>                  
## # ℹ abbreviated name: ¹​in_reply_to_screen_name
## # ℹ 9 more variables: retweet_count <int>, favorite_count <int>, lang <chr>,
## #   university <chr>, tweet_date <dttm>, tweet_minute <dttm>,
## #   tweet_hour <dttm>, tweet_month <date>, timeofday_hour <chr>
summary(tweets)
##    created_at                          id               id_str         
##  Min.   :2009-09-29 14:29:47.0   Min.   :4.469e+09   Length:19575      
##  1st Qu.:2015-01-28 15:07:41.5   1st Qu.:5.604e+17   Class :character  
##  Median :2018-04-13 13:26:56.0   Median :9.848e+17   Mode  :character  
##  Mean   :2017-12-09 15:26:50.7   Mean   :9.400e+17                     
##  3rd Qu.:2020-10-20 10:34:50.0   3rd Qu.:1.318e+18                     
##  Max.   :2023-01-26 14:49:31.0   Max.   :1.619e+18                     
##   full_text         in_reply_to_screen_name retweet_count     favorite_count  
##  Length:19575       Length:19575            Min.   :  0.000   Min.   :  0.00  
##  Class :character   Class :character        1st Qu.:  0.000   1st Qu.:  0.00  
##  Mode  :character   Mode  :character        Median :  1.000   Median :  0.00  
##                                             Mean   :  1.289   Mean   :  1.37  
##                                             3rd Qu.:  2.000   3rd Qu.:  2.00  
##                                             Max.   :267.000   Max.   :188.00  
##      lang            university          tweet_date                    
##  Length:19575       Length:19575       Min.   :2009-09-29 00:00:00.00  
##  Class :character   Class :character   1st Qu.:2015-01-28 00:00:00.00  
##  Mode  :character   Mode  :character   Median :2018-04-13 00:00:00.00  
##                                        Mean   :2017-12-09 02:25:45.00  
##                                        3rd Qu.:2020-10-20 00:00:00.00  
##                                        Max.   :2023-01-26 00:00:00.00  
##   tweet_minute                      tweet_hour                    
##  Min.   :2009-09-29 14:29:00.00   Min.   :2009-09-29 14:00:00.00  
##  1st Qu.:2015-01-28 15:07:00.00   1st Qu.:2015-01-28 14:30:00.00  
##  Median :2018-04-13 13:26:00.00   Median :2018-04-13 13:00:00.00  
##  Mean   :2017-12-09 15:26:24.68   Mean   :2017-12-09 14:59:43.81  
##  3rd Qu.:2020-10-20 10:34:30.00   3rd Qu.:2020-10-20 10:00:00.00  
##  Max.   :2023-01-26 14:49:00.00   Max.   :2023-01-26 14:00:00.00  
##   tweet_month         timeofday_hour    
##  Min.   :2009-09-01   Length:19575      
##  1st Qu.:2015-01-01   Class :character  
##  Median :2018-04-01   Mode  :character  
##  Mean   :2017-11-24                     
##  3rd Qu.:2020-10-01                     
##  Max.   :2023-01-01
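
The summary shows 19,575 tweets spanning 2009-09-29 to 2023-01-26. Engagement is generally low: the median tweet has 1 retweet and 0 favorites, while the most successful tweets reach 267 retweets and 188 favorites. The derived time columns (tweet_date, tweet_hour, tweet_month, timeofday_hour) are already in place for the temporal analysis below.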

Start preprocessing the tweets; to calculate the intervals between tweets later on, some additional properties are needed. The preprocessing steps transform the raw tweet data into a structured format suitable for analysis: timestamps are converted to POSIXct; date, weekday, year and month columns are derived; the tweet language is detected; and emojis in the text are replaced by their names.

# Preprocessing step: convert date and time to POSIXct and derive date, weekday,
# year and month columns. Detect the language and replace emojis by their names.
# Weekdays follow the system locale and start on Monday.
tweets <- tweets %>%
  mutate(
    created_at = as.POSIXct(created_at, format = "%Y-%m-%d %H:%M:%S"),
    date = as.Date(created_at),
    day = lubridate::wday(created_at,
      label = TRUE, abbr = FALSE,
      week_start = getOption("lubridate.week.start", 1),
      locale = Sys.getlocale("LC_TIME")
    ),
    year = year(created_at),
    month = floor_date(created_at, "month"),
    university = as.character(university),
    lang = detect_language(full_text),
    full_text_emojis = replace_emoji(full_text, emoji_dt = lexicon::hash_emojis)
  )

# Helper function to remove emoji tags
# replace_emoji() inserts each emoji as a tag plus its name; we remove the tags
# here and keep the names
remove_emoji_tags <- function(text) {
  str_remove_all(text, "<[a-z0-9]{2}>")
}
# Remove Emoji Tags
tweets$full_text_emojis <- sapply(tweets$full_text_emojis, remove_emoji_tags)
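
A quick illustration of the helper on a hypothetical string (the <xx> tags mimic the two-character byte markers that replace_emoji() leaves behind):

remove_emoji_tags("Welcome <f0><9f><98><8a> smiling face to campus")
## [1] "Welcome  smiling face to campus"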

# Store emojis in a separate column to analyze later
tweets$emoji_unicode <- tweets %>%
  emoji_extract_nest(full_text) %>%
  select(.emoji_unicode)

Question 1: How many tweets do the various universities post, and when? Are any ‘release’ strategies visible?

# Count each tweet by university and hour of the day
tweet_counts_by_hour_of_day <- tweets %>%
  group_by(university, timeofday_hour) %>%
  count() %>%
  arrange(university, timeofday_hour)

# Plot the number of tweets by university and hour of the day
ggplot(
  tweet_counts_by_hour_of_day,
  aes(
    x = timeofday_hour, y = n,
    color = university, group = university
  )
) +
  geom_line() +
  facet_wrap(~university) +
  labs(
    title = "Number of tweets by university and hour",
    x = "Hour of day",
    y = "Number of tweets"
  )
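
Absolute counts are hard to compare across accounts with very different volumes (ost_fh posts far fewer tweets than ZHAW). A minimal sketch of a share-based view, reusing the counts computed above:

# Sketch: hourly posting shares, so low- and high-volume accounts are comparable
tweet_counts_by_hour_of_day %>%
  group_by(university) %>%
  mutate(share = n / sum(n)) %>%
  ggplot(aes(x = timeofday_hour, y = share, group = university)) +
  geom_line() +
  facet_wrap(~university) +
  labs(x = "Hour of day", y = "Share of the account's tweets")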

# Show most active hours for each university
hours_with_most_tweets_by_uni <- tweet_counts_by_hour_of_day %>%
  group_by(university, timeofday_hour) %>%
  summarize(total_tweets = sum(n)) %>%
  group_by(university) %>%
  slice_max(n = 1, order_by = total_tweets)

print(hours_with_most_tweets_by_uni)
## # A tibble: 8 × 3
## # Groups:   university [8]
##   university     timeofday_hour total_tweets
##   <chr>          <chr>                 <int>
## 1 FHNW           09                      344
## 2 FH_Graubuenden 11                      493
## 3 ZHAW           17                      580
## 4 bfh            08                      497
## 5 hes_so         10                      315
## 6 hslu           09                      380
## 7 ost_fh         08                       44
## 8 supsi_ch       11                      330
# Show most active hour overall
hour_with_most_tweets <- tweet_counts_by_hour_of_day %>%
  group_by(timeofday_hour) %>%
  summarize(total_tweets = sum(n)) %>%
  arrange(desc(total_tweets)) %>%
  slice_max(n = 1, order_by = total_tweets)

print(hour_with_most_tweets)
## # A tibble: 1 × 2
##   timeofday_hour total_tweets
##   <chr>                 <int>
## 1 11                     2356
# Count each tweet by university and weekday
tweet_counts_by_week_day <- tweets %>%
  group_by(university, day) %>%
  count() %>%
  arrange(university, day)

# Plot the number of tweets by university and day of the week
ggplot(
  tweet_counts_by_week_day,
  aes(
    x = day, y = n,
    color = university,
    group = university
  )
) +
  geom_line() +
  facet_wrap(~university) +
  labs(
    title = "Number of tweets by university and day of the week",
    x = "Day of the week",
    y = "Number of tweets"
  )

# Show most active days for each university
days_with_most_tweets_by_uni <- tweet_counts_by_week_day %>%
  group_by(university, day) %>%
  summarize(total_tweets = sum(n)) %>%
  group_by(university) %>%
  slice_max(n = 1, order_by = total_tweets)

print(days_with_most_tweets_by_uni)
## # A tibble: 8 × 3
## # Groups:   university [8]
##   university     day       total_tweets
##   <chr>          <ord>            <int>
## 1 FHNW           Tuesday            575
## 2 FH_Graubuenden Tuesday            751
## 3 ZHAW           Wednesday          636
## 4 bfh            Tuesday            651
## 5 hes_so         Tuesday            415
## 6 hslu           Thursday           603
## 7 ost_fh         Friday              65
## 8 supsi_ch       Friday             461
# Calculate time intervals between tweets
# Helper: mode (i.e. the most frequent value) of a vector
find_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
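
Note that find_mode() treats NA as an ordinary value: unique() keeps NA and match() matches it, so NA can win. Since the first tweet of each university has no predecessor, its interval is NA (hence the 8 NA's in the summary below), and in sparse university-years NA can end up as the reported mode. Two toy examples:

find_mode(c(NA, 10, 20)) # every value occurs once; NA comes first, so NA wins
## [1] NA
find_mode(c(5, 5, NA, 20))
## [1] 5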

tweets <- tweets %>%
  arrange(university, created_at) %>%
  group_by(university) %>%
  mutate(time_interval = as.numeric(
    difftime(created_at, lag(created_at), units = "mins")
  ))

# Descriptive statistics of time intervals
summary(tweets$time_interval)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##      0.0    148.2   1128.8   2097.6   2428.3 220707.0        8
# setwd("../4.Text-Mining-Groupwork/plots")
unique_years <- tweets$year %>% unique()
# Plot the distribution of time intervals between tweets for each year
for (curr_year in unique_years) {
  # Filter data for the specific year
  filtered_data <- tweets %>%
    filter(year(created_at) == curr_year)

  print(ggplot(filtered_data, aes(x = time_interval)) +
    geom_histogram(fill = "lightblue") +
    facet_wrap(~university) +
    labs(
      title = paste0(
        "Distribution of time intervals between tweets - ", curr_year
      ),
      x = "Time interval (minutes)",
      y = "Tweet count"
    ))
  universities <- filtered_data$university %>% unique()
  for (uni in universities) {
    # Filter data for the specific university
    uni_filtered_data <- filtered_data %>%
      filter(university == uni)

    print(ggplot(uni_filtered_data, aes(x = time_interval)) +
      geom_histogram(fill = "lightblue") +
      labs(
        title = paste0(
          "Distribution of time intervals between tweets for ", uni,
          " in ", curr_year
        ),
        x = "Time interval (minutes)",
        y = "Tweet count"
      ))
    # Calculate mode (most common interval) in hours
    most_common_interval_minutes <- find_mode(uni_filtered_data$time_interval)
    most_common_interval_hours <- most_common_interval_minutes / 60
    print(paste0(
      "Most common time interval for ", uni,
      " in ",
      curr_year,
      " is ", most_common_interval_minutes,
      " minutes (", most_common_interval_hours, " hours)"
    ))
  }
}

## [1] "Most common time interval for FHNW in 2011 is NA minutes (NA hours)"

## [1] "Most common time interval for FH_Graubuenden in 2011 is 23210.3 minutes (386.838333333333 hours)"

## [1] "Most common time interval for hes_so in 2011 is 1.55 minutes (0.0258333333333333 hours)"

## [1] "Most common time interval for FHNW in 2012 is 17324.65 minutes (288.744166666667 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2012 is 0.9 minutes (0.015 hours)"

## [1] "Most common time interval for ZHAW in 2012 is NA minutes (NA hours)"

## [1] "Most common time interval for bfh in 2012 is NA minutes (NA hours)"

## [1] "Most common time interval for hes_so in 2012 is 22086.35 minutes (368.105833333333 hours)"

## [1] "Most common time interval for FHNW in 2013 is 1.26666666666667 minutes (0.0211111111111111 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2013 is 21879.45 minutes (364.6575 hours)"

## [1] "Most common time interval for ZHAW in 2013 is 0.583333333333333 minutes (0.00972222222222222 hours)"

## [1] "Most common time interval for bfh in 2013 is 65.0833333333333 minutes (1.08472222222222 hours)"

## [1] "Most common time interval for hes_so in 2013 is 36252.5833333333 minutes (604.209722222222 hours)"

## [1] "Most common time interval for supsi_ch in 2013 is 0.783333333333333 minutes (0.0130555555555556 hours)"

## [1] "Most common time interval for FHNW in 2014 is 4.58333333333333 minutes (0.0763888888888889 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2014 is 0.183333333333333 minutes (0.00305555555555556 hours)"

## [1] "Most common time interval for ZHAW in 2014 is 0.05 minutes (0.000833333333333333 hours)"

## [1] "Most common time interval for bfh in 2014 is 153.35 minutes (2.55583333333333 hours)"

## [1] "Most common time interval for hes_so in 2014 is 21986.6 minutes (366.443333333333 hours)"

## [1] "Most common time interval for supsi_ch in 2014 is 37496.4833333333 minutes (624.941388888889 hours)"

## [1] "Most common time interval for FHNW in 2015 is 48918.3 minutes (815.305 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2015 is 1139.9 minutes (18.9983333333333 hours)"

## [1] "Most common time interval for ZHAW in 2015 is 0.316666666666667 minutes (0.00527777777777778 hours)"

## [1] "Most common time interval for bfh in 2015 is 20272.0333333333 minutes (337.867222222222 hours)"

## [1] "Most common time interval for hes_so in 2015 is 0.166666666666667 minutes (0.00277777777777778 hours)"

## [1] "Most common time interval for supsi_ch in 2015 is 43496.6333333333 minutes (724.943888888889 hours)"

## [1] "Most common time interval for FHNW in 2016 is 34708.6666666667 minutes (578.477777777778 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2016 is 240.05 minutes (4.00083333333333 hours)"

## [1] "Most common time interval for ZHAW in 2016 is 21.2 minutes (0.353333333333333 hours)"

## [1] "Most common time interval for bfh in 2016 is 0.0833333333333333 minutes (0.00138888888888889 hours)"

## [1] "Most common time interval for hes_so in 2016 is 2.7 minutes (0.045 hours)"

## [1] "Most common time interval for hslu in 2016 is NA minutes (NA hours)"

## [1] "Most common time interval for supsi_ch in 2016 is 1.58333333333333 minutes (0.0263888888888889 hours)"

## [1] "Most common time interval for FHNW in 2017 is 48748.5333333333 minutes (812.475555555556 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2017 is 5617.83333333333 minutes (93.6305555555555 hours)"

## [1] "Most common time interval for ZHAW in 2017 is 6954.43333333333 minutes (115.907222222222 hours)"

## [1] "Most common time interval for bfh in 2017 is 18606.6666666667 minutes (310.111111111111 hours)"

## [1] "Most common time interval for hes_so in 2017 is 71909.9833333333 minutes (1198.49972222222 hours)"

## [1] "Most common time interval for hslu in 2017 is 0.266666666666667 minutes (0.00444444444444444 hours)"

## [1] "Most common time interval for supsi_ch in 2017 is 1.36666666666667 minutes (0.0227777777777778 hours)"

## [1] "Most common time interval for FHNW in 2018 is 0.166666666666667 minutes (0.00277777777777778 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2018 is 1446.23333333333 minutes (24.1038888888889 hours)"

## [1] "Most common time interval for ZHAW in 2018 is 5689.93333333333 minutes (94.8322222222222 hours)"

## [1] "Most common time interval for bfh in 2018 is 20172.05 minutes (336.200833333333 hours)"

## [1] "Most common time interval for hes_so in 2018 is 31170.8333333333 minutes (519.513888888889 hours)"

## [1] "Most common time interval for hslu in 2018 is 0.233333333333333 minutes (0.00388888888888889 hours)"

## [1] "Most common time interval for supsi_ch in 2018 is 0.183333333333333 minutes (0.00305555555555556 hours)"

## [1] "Most common time interval for FHNW in 2019 is 315.233333333333 minutes (5.25388888888889 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2019 is 10079.85 minutes (167.9975 hours)"

## [1] "Most common time interval for ZHAW in 2019 is 1255.61666666667 minutes (20.9269444444444 hours)"

## [1] "Most common time interval for bfh in 2019 is 1440.05 minutes (24.0008333333333 hours)"

## [1] "Most common time interval for hes_so in 2019 is 1140.03333333333 minutes (19.0005555555556 hours)"

## [1] "Most common time interval for hslu in 2019 is 1.95 minutes (0.0325 hours)"

## [1] "Most common time interval for supsi_ch in 2019 is 15 minutes (0.25 hours)"

## [1] "Most common time interval for FHNW in 2020 is 3180.16666666667 minutes (53.0027777777778 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2020 is 2880.03333333333 minutes (48.0005555555556 hours)"

## [1] "Most common time interval for ZHAW in 2020 is 13693.7666666667 minutes (228.229444444444 hours)"

## [1] "Most common time interval for bfh in 2020 is 14531.7333333333 minutes (242.195555555556 hours)"

## [1] "Most common time interval for hes_so in 2020 is 1139.91666666667 minutes (18.9986111111111 hours)"

## [1] "Most common time interval for hslu in 2020 is 120 minutes (2 hours)"

## [1] "Most common time interval for ost_fh in 2020 is NA minutes (NA hours)"

## [1] "Most common time interval for supsi_ch in 2020 is 0.133333333333333 minutes (0.00222222222222222 hours)"

## [1] "Most common time interval for FHNW in 2021 is 0.5 minutes (0.00833333333333333 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2021 is 0.333333333333333 minutes (0.00555555555555555 hours)"

## [1] "Most common time interval for ZHAW in 2021 is 13043.9833333333 minutes (217.399722222222 hours)"

## [1] "Most common time interval for bfh in 2021 is 1411.05 minutes (23.5175 hours)"

## [1] "Most common time interval for hes_so in 2021 is 0 minutes (0 hours)"

## [1] "Most common time interval for hslu in 2021 is 0 minutes (0 hours)"

## [1] "Most common time interval for ost_fh in 2021 is 0.35 minutes (0.00583333333333333 hours)"

## [1] "Most common time interval for supsi_ch in 2021 is 1140 minutes (19 hours)"

## [1] "Most common time interval for FHNW in 2022 is 1439.93333333333 minutes (23.9988888888889 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2022 is 0.1 minutes (0.00166666666666667 hours)"

## [1] "Most common time interval for ZHAW in 2022 is 18623.7166666667 minutes (310.395277777778 hours)"

## [1] "Most common time interval for bfh in 2022 is 7192.96666666667 minutes (119.882777777778 hours)"

## [1] "Most common time interval for hes_so in 2022 is 5798.53333333333 minutes (96.6422222222222 hours)"

## [1] "Most common time interval for hslu in 2022 is 0 minutes (0 hours)"

## [1] "Most common time interval for ost_fh in 2022 is 0.133333333333333 minutes (0.00222222222222222 hours)"

## [1] "Most common time interval for supsi_ch in 2022 is 28800.7333333333 minutes (480.012222222222 hours)"

## [1] "Most common time interval for FHNW in 2023 is 9997.63333333333 minutes (166.627222222222 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2023 is 21962.3833333333 minutes (366.039722222222 hours)"

## [1] "Most common time interval for ZHAW in 2023 is 70740.3333333333 minutes (1179.00555555556 hours)"

## [1] "Most common time interval for bfh in 2023 is 8000.11666666667 minutes (133.335277777778 hours)"

## [1] "Most common time interval for hes_so in 2023 is 4621.1 minutes (77.0183333333333 hours)"

## [1] "Most common time interval for hslu in 2023 is 627.083333333333 minutes (10.4513888888889 hours)"

## [1] "Most common time interval for supsi_ch in 2023 is 7199 minutes (119.983333333333 hours)"

## [1] "Most common time interval for FH_Graubuenden in 2009 is NA minutes (NA hours)"

## [1] "Most common time interval for FH_Graubuenden in 2010 is 55732.2833333333 minutes (928.871388888889 hours)"

## [1] "Most common time interval for hes_so in 2010 is NA minutes (NA hours)"

Question 2: What are the tweets about and how do other Twitter users react to them (likes, etc.)?

Data Preprocessing

langs <- c("de", "fr", "it", "en")
tweets_filtered <- tweets %>%
  filter(lang %in% langs)
# Define extended stopwords (outside loop for efficiency)
# Remove 'amp' because it is just the HTML-escaped '&' symbol
# Remove 'rt', which appears both as the retweet marker and as a fragment of
# words such as 'engagiert'
# 'fr' and 'ber' are most likely umlaut-stripped remnants of 'für' and 'über'
extended_stopwords <- c(
  "#fhnw", "#bfh", "@htw_chur", "#hslu", "#supsi", "#sups",
  "amp", "rt", "fr", "ber"
)
# Create separate DFMs for each language
dfm_list <- list()
for (sel_lang in langs) {
  # Subset tweets for the current language
  tweets_lang <- tweets_filtered %>%
    filter(lang == sel_lang)
  # Get the stopword list for the current language
  stopwords_lang <- stopwords(sel_lang)
  # Create tokens for all tweets:
  # - create a corpus first, because tokens() only accepts character, corpus,
  #   list, tokens and tokens_xptr objects
  # - tokenize and remove URLs, punctuation, numbers, symbols and separators
  # - transform to lowercase
  # - stem all words
  # - keep unigrams (bigrams and trigrams are analyzed later)
  # - remove the stopwords after stemming, because stemming strips the endings
  #   from some words and they would otherwise slip past the stopword filter
  tokens_lang <- tweets_lang %>%
    corpus(text_field = "full_text_emojis") %>%
    tokens(
      remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
      remove_url = TRUE, remove_separators = TRUE
    ) %>%
    tokens_tolower() %>%
    tokens_wordstem(lang = sel_lang) %>%
    tokens_ngrams(n = 1) %>%
    tokens_select(
      pattern =
        c(stopwords_lang, extended_stopwords), selection = "remove"
    )
  # Create DFM for the current language
  dfm_list[[sel_lang]] <- dfm(tokens_lang)
}
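
As a quick sanity check before the frequency analysis below, quanteda's topfeatures() can list the most frequent stems per language (a minimal sketch):

for (sel_lang in langs) {
  cat("\nTop stems (", sel_lang, "):\n", sep = "")
  print(topfeatures(dfm_list[[sel_lang]], 10))
}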

Content Analysis

  • HSLU has many English tweets
  • BFH and FHNW have many German tweets
  • SUPSI has many Italian tweets; its most common words are Italian
  • HES-SO has many French tweets; its most common words are French
# Word Frequencies & Visualization
words_freqs_en <- sort(colSums(dfm_list$en), decreasing = TRUE)
head(words_freqs_en, 20)
##     student         new       @hslu     univers     project       thank 
##         106          74          70          62          60          60 
##       @zhaw         day      scienc       today       innov         now 
##          59          56          54          52          51          50 
##       swiss switzerland       @fhnw       great          us        join 
##          49          49          46          46          44          43 
##       studi    research 
##          42          42
wordcloud2(data.frame(
  word = names(words_freqs_en),
  freq = words_freqs_en
), size = 0.5)
words_freqs_de <- sort(colSums(dfm_list$de), decreasing = TRUE)
head(words_freqs_de, 20)
##       neu      mehr   schweiz      werd       all   studier      heut hochschul 
##      1586      1104       967       772       706       706       638       601 
##       bfh      jahr     knnen   digital     thema     studi   projekt     welch 
##       577       535       507       499       497       466       465       462 
##      bern     statt     zeigt    arbeit 
##       454       451       437       434
wordcloud2(data.frame(
  word = names(words_freqs_de),
  freq = words_freqs_de
), size = 0.5)
word_freqs_it <- sort(colSums(dfm_list$it), decreasing = TRUE)
head(word_freqs_it, 20)
##        nuov        sups     progett     student     present        info 
##         210         208         173         146         143         143 
##   iscrizion        cors      ricerc   formazion  #supsinews #supsievent 
##         142         141         135         134         134         129 
##       scopr      inform      diplom    bachelor       apert        tutt 
##         123         120         116         111         110         105 
##      master          pi 
##         103         102
wordcloud2(data.frame(
  word = names(word_freqs_it),
  freq = word_freqs_it
), size = 0.5)
# Some of these look like English words, but they are most likely emoji names inserted by replace_emoji()
word_freqs_fr <- sort(colSums(dfm_list$fr), decreasing = TRUE)
head(word_freqs_fr, 20)
##    hes-so     right     arrow       dan    projet         a      tudi      haut 
##       505       432       324       249       248       234       199       183 
##       col   @hes_so @hessoval    dcouvr      book      open  recherch   #hes_so 
##       155       140       129       127       123       118       117       115 
##     suiss      plus      mast   nouveau 
##       110       105       103        98
wordcloud2(data.frame(
  word = names(word_freqs_fr),
  freq = word_freqs_fr
), size = 0.5)
# University-specific Analysis
for (uni in unique(tweets$university)) {
  # Subset tweets for the current university
  uni_tweets <- tweets_filtered %>%
    filter(university == uni)

  tokens_lang <- uni_tweets %>%
    corpus(text_field = "full_text_emojis") %>%
    tokens(
      remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
      remove_url = TRUE, remove_separators = TRUE
    ) %>%
    tokens_tolower() %>%
    tokens_wordstem() %>%
    tokens_ngrams(n = 1) %>%
    tokens_select(
      pattern =
        c(
          stopwords("en"), stopwords("de"),
          stopwords("fr"), stopwords("it"), extended_stopwords
        ), selection = "remove"
    )
  # Create a document-feature matrix (DFM) for this university across all languages
  uni_dfm <- dfm(tokens_lang)
  # Word frequencies
  uni_word_freqs <- sort(colSums(uni_dfm), decreasing = TRUE)
  # Print the most common words: the 'right' arrow emoji name is used often
  # (print() is required inside a loop, otherwise nothing is displayed)
  print(head(uni_word_freqs, 20))
  print(wordcloud2(data.frame(
    word = names(uni_word_freqs),
    freq = uni_word_freqs
  ), size = 0.5))
}

User Reaction Analysis

# Calculate a 'weighted engagement' metric
tweets <- tweets %>%
  mutate(
    weighted_engagement = favorite_count * 1 + retweet_count * 2
  )
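
Retweets are weighted twice as heavily as favorites here, on the assumption that a retweet carries a tweet to new audiences while a favorite does not. A short sketch that puts the metric to use per account (it relies only on the columns created above):

# Mean weighted engagement per university
tweets %>%
  group_by(university) %>%
  summarize(mean_engagement = mean(weighted_engagement)) %>%
  arrange(desc(mean_engagement))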

# Identify tweets with the highest weighted engagement
most_engaged_tweets <- tweets %>%
  arrange(desc(weighted_engagement)) %>%
  head(1000) # Top 1000 for analysis

# Analyze posting time of most engaged tweets (same as before)
most_engaged_tweets_time <- most_engaged_tweets %>%
  mutate(time_of_day = format(created_at, "%H"))

ggplot(most_engaged_tweets_time, aes(x = as.numeric(time_of_day))) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "blue") +
  labs(
    title = "Distribution of Posting Times for Most Engaged Tweets",
    x = "Hour of Day",
    y = "Frequency"
  )

Analyze the content of the most liked tweets

# Preprocess the content of the most liked tweets
tokens_most_engaged <- most_engaged_tweets %>%
  corpus(text_field = "full_text_emojis") %>%
  tokens(
    remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
    remove_url = TRUE, remove_separators = TRUE
  ) %>%
  tokens_tolower() %>%
  tokens_wordstem() %>% # note: sel_lang from the earlier loop would be stale here
  tokens_ngrams(n = 1) %>%
  tokens_select(
    pattern =
      c(
        stopwords("en"), stopwords("de"),
        stopwords("fr"), stopwords("it"), extended_stopwords
      ), selection = "remove"
  )
tokens_most_engaged_dfm <- dfm(tokens_most_engaged)
freqs_most_engaged <- sort(colSums(tokens_most_engaged_dfm), decreasing = TRUE)
# Print the most common words: the 'right' arrow emoji name is used often
head(freqs_most_engaged, 20)
##            mehr            neue         schweiz       schweizer           right 
##              81              67              48              47              46 
##            heut           zeigt #hsluinformatik           studi            zhaw 
##              44              41              40              39              39 
##          hes-so           knnen           neuen       hochschul          campus 
##              38              38              36              34              33 
##           innov            gibt              ab      entwickelt             bfh 
##              31              30              30              30              30
set.seed(123)
wordcloud2(data.frame(
  word = names(freqs_most_engaged),
  freq = freqs_most_engaged
), size = 0.5)

Question 3: How do the university tweets differ in terms of content, style, emotions, etc.?

Content Analysis (Word Clouds)

for (uni in unique(tweets$university)) {
  uni_tweets <- tweets %>%
    filter(university == uni, lang %in% langs)
  tokens_uni <- uni_tweets %>%
    corpus(text_field = "full_text_emojis") %>%
    tokens(
      remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
      remove_url = TRUE, remove_separators = TRUE
    ) %>%
    tokens_tolower() %>%
    tokens_wordstem() %>%
    tokens_ngrams(n = 1) %>%
    tokens_select(
      pattern =
        c(
          stopwords("en"), stopwords("de"),
          stopwords("fr"), stopwords("it"), extended_stopwords
        ), selection = "remove"
    )
  uni_dfm <- dfm(tokens_uni)
  freqs_uni <- sort(colSums(uni_dfm), decreasing = TRUE)
  # Print the most common words: the 'right' arrow emoji name is used often
  print(head(freqs_uni, 20))
  set.seed(123)
  print(wordcloud2(data.frame(
    word = names(freqs_uni),
    freq = freqs_uni
  ), size = 0.5))

  # Analyze Top Emojis by University
  emoji_count_per_university <- uni_tweets %>%
    top_n_emojis(full_text)

  print(emoji_count_per_university)

  # print() is required for the plot to render inside the loop
  print(
    emoji_count_per_university %>%
      mutate(emoji_name = reorder(emoji_name, n)) %>%
      ggplot(aes(n, emoji_name)) +
      geom_col() +
      labs(x = "Count", y = NULL, title = "Top 20 Emojis Used")
  )
}
## # A tibble: 20 × 4
##    emoji_name                    unicode emoji_category        n
##    <chr>                         <chr>   <chr>             <int>
##  1 backhand_index_pointing_right 👉      People & Body        56
##  2 yellow_heart                  💛      Smileys & Emotion    34
##  3 black_heart                   🖤      Smileys & Emotion    32
##  4 woman                         👩      People & Body        28
##  5 man                           👨      People & Body        17
##  6 clap                          👏      People & Body        16
##  7 flag_Switzerland              🇨🇭      Flags                15
##  8 microscope                    🔬      Objects              15
##  9 computer                      💻      Objects              14
## 10 graduation_cap                🎓      Objects              13
## 11 school                        🏫      Travel & Places      13
## 12 face_with_medical_mask        😷      Smileys & Emotion    12
## 13 raised_hands                  🙌      People & Body        12
## 14 robot                         🤖      Smileys & Emotion    12
## 15 female_sign                   ♀️       Symbols              10
## 16 trophy                        🏆      Activities            9
## 17 woman_scientist               👩‍🔬      People & Body         9
## 18 party_popper                  🎉      Activities            8
## 19 star_struck                   🤩      Smileys & Emotion     8
## 20 sun_with_face                 🌞      Travel & Places       8
## # A tibble: 20 × 4
##    emoji_name                      unicode emoji_category        n
##    <chr>                           <chr>   <chr>             <int>
##  1 party_popper                    🎉      Activities           18
##  2 grinning_face_with_big_eyes     😃      Smileys & Emotion    15
##  3 blush                           😊      Smileys & Emotion     8
##  4 smiling_face_with_sunglasses    😎      Smileys & Emotion     8
##  5 bulb                            💡      Objects               7
##  6 +1                              👍      People & Body         6
##  7 camera_flash                    📸      Objects               6
##  8 flexed_biceps                   💪      People & Body         6
##  9 four_leaf_clover                🍀      Animals & Nature      6
## 10 grinning_face_with_smiling_eyes 😄      Smileys & Emotion     6
## 11 heart_eyes                      😍      Smileys & Emotion     6
## 12 hugs                            🤗      Smileys & Emotion     6
## 13 female_sign                     ♀️       Symbols               4
## 14 graduation_cap                  🎓      Objects               4
## 15 grinning                        😀      Smileys & Emotion     4
## 16 robot                           🤖      Smileys & Emotion     4
## 17 backhand_index_pointing_down    👇      People & Body         3
## 18 computer                        💻      Objects               3
## 19 lady_beetle                     🐞      Animals & Nature      3
## 20 ocean                           🌊      Travel & Places       3
## # A tibble: 20 × 4
##    emoji_name                    unicode emoji_category        n
##    <chr>                         <chr>   <chr>             <int>
##  1 backhand_index_pointing_right 👉      People & Body        21
##  2 high_voltage                  ⚡      Travel & Places      11
##  3 wink                          😉      Smileys & Emotion     9
##  4 clap                          👏      People & Body         5
##  5 flag_Switzerland              🇨🇭      Flags                 5
##  6 rocket                        🚀      Travel & Places       5
##  7 +1                            👍      People & Body         4
##  8 arrow_right                   ➡️       Symbols               4
##  9 bug                           🐛      Animals & Nature      3
## 10 computer                      💻      Objects               3
## 11 flexed_biceps                 💪      People & Body         3
## 12 man                           👨      People & Body         3
## 13 bangbang                      ‼️       Symbols               2
## 14 dark_skin_tone                🏿      Component             2
## 15 exclamation                   ❗      Symbols               2
## 16 female_sign                   ♀️       Symbols               2
## 17 four_leaf_clover              🍀      Animals & Nature      2
## 18 green_salad                   🥗      Food & Drink          2
## 19 grinning                      😀      Smileys & Emotion     2
## 20 medium_light_skin_tone        🏼      Component             2
## # A tibble: 20 × 4
##    emoji_name                    unicode emoji_category        n
##    <chr>                         <chr>   <chr>             <int>
##  1 backhand_index_pointing_right 👉      People & Body        49
##  2 battery                       🔋      Objects              16
##  3 backhand_index_pointing_down  👇      People & Body        12
##  4 woman                         👩      People & Body        12
##  5 palm_tree                     🌴      Animals & Nature     11
##  6 bulb                          💡      Objects              10
##  7 computer                      💻      Objects              10
##  8 evergreen_tree                🌲      Animals & Nature     10
##  9 graduation_cap                🎓      Objects              10
## 10 party_popper                  🎉      Activities           10
## 11 robot                         🤖      Smileys & Emotion    10
## 12 clap                          👏      People & Body         9
## 13 coconut                       🥥      Food & Drink          9
## 14 date                          📅      Objects               9
## 15 deciduous_tree                🌳      Animals & Nature      9
## 16 flag_Switzerland              🇨🇭      Flags                 9
## 17 rocket                        🚀      Travel & Places       9
## 18 automobile                    🚗      Travel & Places       8
## 19 clinking_glasses              🥂      Food & Drink          8
## 20 seedling                      🌱      Animals & Nature      8
## # A tibble: 20 × 4
##    emoji_name                    unicode emoji_category      n
##    <chr>                         <chr>   <chr>           <int>
##  1 arrow_right                   ➡️       Symbols           320
##  2 arrow_heading_down            ⤵️       Symbols           245
##  3 book                          📖      Objects           115
##  4 mag_right                     🔎      Objects            97
##  5 mega                          📣      Objects            53
##  6 clapper                       🎬      Objects            38
##  7 NEW_button                    🆕      Symbols            35
##  8 computer                      💻      Objects            35
##  9 microscope                    🔬      Objects            32
## 10 bulb                          💡      Objects            29
## 11 police_car_light              🚨      Travel & Places    27
## 12 backhand_index_pointing_right 👉      People & Body      26
## 13 graduation_cap                🎓      Objects            23
## 14 studio_microphone             🎙️       Objects            23
## 15 clap                          👏      People & Body      21
## 16 date                          📅      Objects            17
## 17 medal_sports                  🏅      Activities         15
## 18 memo                          📝      Objects            15
## 19 woman                         👩      People & Body      15
## 20 flag_Switzerland              🇨🇭      Flags              14
## # A tibble: 20 × 4
##    emoji_name                   unicode emoji_category        n
##    <chr>                        <chr>   <chr>             <int>
##  1 sparkles                     ✨      Activities           28
##  2 flag_Switzerland             🇨🇭      Flags                18
##  3 rocket                       🚀      Travel & Places      12
##  4 party_popper                 🎉      Activities           11
##  5 partying_face                🥳      Smileys & Emotion     9
##  6 Christmas_tree               🎄      Activities            7
##  7 clap                         👏      People & Body         7
##  8 star                         ⭐      Travel & Places       7
##  9 bottle_with_popping_cork     🍾      Food & Drink          6
## 10 bulb                         💡      Objects               5
## 11 glowing_star                 🌟      Travel & Places       5
## 12 smiling_face_with_sunglasses 😎      Smileys & Emotion     5
## 13 +1                           👍      People & Body         4
## 14 camera_flash                 📸      Objects               4
## 15 clinking_glasses             🥂      Food & Drink          4
## 16 four_leaf_clover             🍀      Animals & Nature      4
## 17 musical_notes                🎶      Objects               4
## 18 person_running               🏃      People & Body         4
## 19 raised_hands                 🙌      People & Body         4
## 20 robot                        🤖      Smileys & Emotion     4
## # A tibble: 20 × 4
##    emoji_name                    unicode emoji_category        n
##    <chr>                         <chr>   <chr>             <int>
##  1 graduation_cap                🎓      Objects               3
##  2 man                           👨      People & Body         2
##  3 man_student                   👨‍🎓      People & Body         2
##  4 rocket                        🚀      Travel & Places       2
##  5 snowflake                     ❄️       Travel & Places       2
##  6 backhand_index_pointing_right 👉      People & Body         1
##  7 brain                         🧠      People & Body         1
##  8 chocolate_bar                 🍫      Food & Drink          1
##  9 clapper                       🎬      Objects               1
## 10 eyes                          👀      People & Body         1
## 11 fire                          🔥      Travel & Places       1
## 12 flexed_biceps                 💪      People & Body         1
## 13 grinning                      😀      Smileys & Emotion     1
## 14 heart_eyes_cat                😻      Smileys & Emotion     1
## 15 high_voltage                  ⚡      Travel & Places       1
## 16 mantelpiece_clock             🕰️       Travel & Places       1
## 17 sleeping                      😴      Smileys & Emotion     1
## 18 slightly_smiling_face         🙂      Smileys & Emotion     1
## 19 sun                           ☀️       Travel & Places       1
## 20 woman                         👩      People & Body         1
## # A tibble: 20 × 4
##    emoji_name                    unicode emoji_category        n
##    <chr>                         <chr>   <chr>             <int>
##  1 arrow_right                   ➡️       Symbols              83
##  2 backhand_index_pointing_right 👉      People & Body        21
##  3 graduation_cap                🎓      Objects              19
##  4 arrow_forward                 ▶️       Symbols              18
##  5 bulb                          💡      Objects              10
##  6 rocket                        🚀      Travel & Places       9
##  7 party_popper                  🎉      Activities            8
##  8 flag_Switzerland              🇨🇭      Flags                 7
##  9 clap                          👏      People & Body         6
## 10 exclamation                   ❗      Symbols               5
## 11 SOON_arrow                    🔜      Symbols               4
## 12 grinning_face_with_big_eyes   😃      Smileys & Emotion     4
## 13 camera_flash                  📸      Objects               3
## 14 computer                      💻      Objects               3
## 15 movie_camera                  🎥      Objects               3
## 16 rainbow                       🌈      Travel & Places       3
## 17 studio_microphone             🎙️       Objects               3
## 18 woman                         👩      People & Body         3
## 19 Christmas_tree                🎄      Activities            2
## 20 backhand_index_pointing_down  👇      People & Body         2
# Generate general tokens for bigram and trigram analysis
tokens <- tweets %>%
  corpus(text_field = "full_text_emojis") %>%
  tokens(
    remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
    remove_url = TRUE, remove_separators = TRUE
  ) %>%
  tokens_tolower() %>%
  tokens_wordstem() %>%
  tokens_select(
    pattern =
      c(
        stopwords("en"), stopwords("de"),
        stopwords("fr"), stopwords("it"), extended_stopwords
      ), selection = "remove"
  )
# Bigram Wordcloud
bi_gram_tokens <- tokens_ngrams(tokens, n = 2)
dfm_bi_gram <- dfm(bi_gram_tokens)
freqs_bi_gram <- sort(colSums(dfm_bi_gram), decreasing = TRUE)
head(freqs_bi_gram, 20)
##          right_arrow             htw_chur          index_point 
##                  421                  259                  207 
##       backhand_index     hochschul_luzern          point_right 
##                  206                  185                  183 
## berner_fachhochschul        sozial_arbeit              prof_dr 
##                  157                  154                  142 
##            haut_cole      herzlich_gratul            open_book 
##                  141                  139                  117 
##        magnifi_glass           glass_tilt           tilt_right 
##                   97                   97                   97 
##         fh_graubnden  neusten_blogbeitrag   book_#revuehmisphr 
##                   91                   87                   85 
##         social_media         advanc_studi 
##                   84                   83
# Create the bigram word cloud
set.seed(123)
wordcloud2(data.frame(
  word = names(freqs_bi_gram),
  freq = freqs_bi_gram
), size = 0.5)
# Trigram Wordcloud
tri_gram_tokens <- tokens_ngrams(tokens, n = 3)
dfm_tri_gram <- dfm(tri_gram_tokens)
freqs_tri_gram <- sort(colSums(dfm_tri_gram), decreasing = TRUE)
head(freqs_tri_gram, 20)
##         backhand_index_point            index_point_right 
##                          206                          183 
##           magnifi_glass_tilt             glass_tilt_right 
##                           97                           97 
##      open_book_#revuehmisphr   hochschul_gestaltung_kunst 
##                           85                           62 
## dipartimento_tecnologi_innov          master_advanc_studi 
##                           40                           38 
##         depart_sozial_arbeit       #infoanlass_mrz_findet 
##                           36                           33 
##              polic_car_light         univers_appli_scienc 
##                           32                           31 
##         busi_administr_statt     findet_#zrich_infoanlass 
##                           30                           30 
##               tag_offenen_tr        hochschul_life_scienc 
##                           29                           29 
##        gestaltung_kunst_fhnw           mas_busi_administr 
##                           29                           28 
##       mehr_neuen_blogbeitrag     mehr_neusten_blogbeitrag 
##                           28                           28
# Create the trigram word cloud
set.seed(123)
wordcloud2(data.frame(
  word = names(freqs_tri_gram),
  freq = freqs_tri_gram
), size = 0.5)

LDA Topic Modeling

# Source: Christoph Zangger -> removes all rows that contain only zeros
new_dfm <- dfm_subset(dfm_list$en, ntoken(dfm_list$en) > 0)
tweet_lda <- LDA(new_dfm, k = 5, control = list(seed = 123))
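
The choice of k = 5 topics is ad hoc. A minimal sketch for sanity-checking it, assuming the same topicmodels LDA() interface used above (lower perplexity is better, though interpretability should have the final say):

# Compare model fit for a few candidate topic counts
for (k in c(3, 5, 8, 12)) {
  lda_k <- LDA(new_dfm, k = k, control = list(seed = 123))
  cat("k =", k, "-> perplexity:", perplexity(lda_k), "\n")
}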
# Tidy the LDA results
topic_terms <- tidy(tweet_lda, matrix = "beta")
# Extract topics and top terms
topics <- as.data.frame(terms(tweet_lda, 50)) # First fifty words per topic

# Extract top terms per topic
top_terms <- topic_terms %>%
  group_by(topic) %>%
  top_n(8, beta) %>% # Show top 8 terms per topic
  ungroup() %>%
  arrange(topic, -beta)

# Visualize top terms per topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  scale_y_reordered() +
  labs(
    x = "Beta (Term Importance within Topic)",
    y = NULL,
    title = "Top Terms per Topic in Tweets (LDA)"
  )

# Most different words among the first three topics (using log ratios)
diff <- topic_terms %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001 | topic3 > .001) %>%
  mutate(
    logratio_t1t2 = log2(topic2 / topic1),
    logratio_t1t3 = log2(topic3 / topic1),
    logratio_t2t3 = log2(topic3 / topic2)
  )
diff
## # A tibble: 318 × 9
##    term       topic1  topic2  topic3  topic4  topic5 logratio_t1t2 logratio_t1t3
##    <chr>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl>         <dbl>         <dbl>
##  1 @academi… 7.84e-4 2.67e-3 3.06e-3 2.69e-3 1.60e-3         1.77          1.96 
##  2 @bfh_hesb 3.20e-3 1.80e-3 4.26e-3 5.17e-3 9.55e-4        -0.832         0.414
##  3 @eth_en   4.35e-4 1.53e-3 1.51e-4 3.36e-4 4.54e-4         1.82         -1.52 
##  4 @fh_grau… 1.52e-4 3.37e-4 1.82e-3 5.07e-4 1.75e-3         1.15          3.59 
##  5 @fhnw     1.01e-2 5.26e-4 3.28e-3 3.29e-3 1.89e-3        -4.27         -1.63 
##  6 @fhnwbusi 3.44e-3 2.31e-3 5.31e-3 3.71e-4 5.18e-3        -0.577         0.627
##  7 @globalc… 1.18e-3 5.89e-4 1.82e-4 4.27e-4 5.33e-4        -0.999        -2.70 
##  8 @greater… 1.21e-4 6.38e-4 1.91e-3 1.92e-3 8.10e-4         2.40          3.98 
##  9 @grstift… 2.02e-3 3.23e-3 7.09e-4 9.99e-4 1.77e-3         0.677        -1.51 
## 10 @hes_so   1.38e-3 2.38e-3 1.69e-3 5.81e-4 1.86e-3         0.786         0.290
## # ℹ 308 more rows
## # ℹ 1 more variable: logratio_t2t3 <dbl>
# LDA Topic Modeling for each university
universities <- unique(tweets$university)

for (uni in universities) {
  # Filter tweets for the current university
  uni_tweets <- tweets %>% filter(university == uni)

  tokens_uni <- uni_tweets %>%
    corpus(text_field = "full_text_emojis") %>%
    tokens(
      remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
      remove_url = TRUE, remove_separators = TRUE
    ) %>%
    tokens_tolower() %>%
    tokens_wordstem() %>%
    tokens_ngrams(n = 1) %>%
    tokens_select(
      pattern =
        c(
          stopwords("en"), stopwords("de"),
          stopwords("fr"), stopwords("it"), extended_stopwords
        ), selection = "remove"
    )
  uni_dfm <- dfm(tokens_uni)
  # Apply LDA
  uni_dfm <- dfm_subset(uni_dfm, ntoken(uni_dfm) > 0)
  tweet_lda <- LDA(uni_dfm, k = 5, control = list(seed = 123))
  # Tidy the LDA results
  tweet_lda_td <- tidy(tweet_lda)
  # Extract top terms per topic
  top_terms <- tweet_lda_td %>%
    group_by(topic) %>%
    top_n(8, beta) %>%
    ungroup() %>%
    arrange(topic, -beta)
  # Visualize top terms per topic
  p <- top_terms %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(beta, term, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~topic, scales = "free") +
    scale_y_reordered() +
    labs(
      x = "Beta (Term Importance within Topic)",
      y = NULL,
      title = paste("Top Terms per Topic in Tweets from", uni, "(LDA)")
    )
  print(p)
  # Topic Model Summary: top 10 terms per topic
  cat("\nTopic Model Summary for", uni, ":\n")
  print(as.data.frame(terms(tweet_lda, 10)))
}

## 
## Topic Model Summary for FHNW :
##         Topic 1         Topic 2         Topic 3      Topic 4         Topic 5
## 1     hochschul       @fhnwbusi            fhnw         mehr        @hsafhnw
## 2  @fhnwtechnik           @fhnw            mehr    @fhnwbusi       @fhnwbusi
## 3      @hsafhnw              ab            heut        @fhnw           @fhnw
## 4          neue            mehr      studierend     challeng       hochschul
## 5          fhnw       interview         projekt   studierend           neuen
## 6        campus @fhnwpsychologi              dr      projekt          campus
## 7          mehr       hochschul            neue         fhnw @fhnwpsychologi
## 8    gestaltung          kinder       @fhnwbusi     @hsafhnw    @fhnwtechnik
## 9          dass           statt @fhnwpsychologi @fhnwtechnik            mehr
## 10        kunst    #campusolten        backhand        swiss         schweiz

## 
## Topic Model Summary for FH_Graubuenden :
##        Topic 1     Topic 2     Topic 3     Topic 4         Topic 5
## 1         chur         htw    #htwchur      findet            chur
## 2          htw        mehr blogbeitrag        chur           statt
## 3   #graubnden       #fhgr       statt       statt        #htwchur
## 4     #htwchur       statt       #fhgr  infoanlass     blogbeitrag
## 5  blogbeitrag   graubnden #infoanlass    #htwchur          findet
## 6    graubnden      findet   graubnden        mehr           thema
## 7         mehr       neuen   tourismus #infoanlass @suedostschweiz
## 8        #chur        chur       manag         htw    studierenden
## 9     #schweiz    #studium      findet        busi           oktob
## 10        heut #infoanlass  studierend        heut            face

## 
## Topic Model Summary for ZHAW :
##        Topic 1     Topic 2          Topic 3          Topic 4          Topic 5
## 1         zhaw       @zhaw             gibt @engineeringzhaw             zhaw
## 2         heut #zhawimpact            @zhaw          schweiz            @zhaw
## 3        #zhaw         via @engineeringzhaw            zeigt       winterthur
## 4         dank        zhaw               cc            #zhaw            neuen
## 5        @zhaw        mehr        schweizer               cc        @sml_zhaw
## 6  @c_caviglia       #zhaw             neue             neue            knnen
## 7          via          ab             mehr            studi @engineeringzhaw
## 8           cc          cc             heut         #tonitag         menschen
## 9     wdenswil        neue     studierenden             dank             neue
## 10       studi  winterthur             zhaw            knnen             mehr

## 
## Topic Model Summary for bfh :
##          Topic 1  Topic 2       Topic 3         Topic 4   Topic 5
## 1            bfh    thema          bern             bfh    berner
## 2           mehr     neue     @bfh_hesb       @bfh_hesb      neue
## 3           bern    statt fachhochschul            bern     thema
## 4         berner @hkb_bfh          http          berner      bern
## 5  fachhochschul     bern          mehr            mehr @bfh_hesb
## 6           biel    neuen          neue           knnen       bfh
## 7           heut     mehr         thema           studi     studi
## 8     entwickelt    onlin         knnen           innen  herzlich
## 9       anmelden    zeigt           neu #knoten_maschen schweizer
## 10         innen      bfh         neuen           welch    sozial

## 
## Topic Model Summary for hes_so :
##        Topic 1     Topic 2 Topic 3       Topic 4  Topic 5
## 1          dan      hes-so  hes-so          open    right
## 2        arrow       right   right          book   hes-so
## 3       projet        haut   arrow          haut    arrow
## 4         plus       arrow tudiant        projet      dan
## 5       scienc @hessovalai  projet       @hes_so   projet
## 6         book      domain magnifi          cole  @hes_so
## 7        right    recherch    cole           dan  #hes_so
## 8     dcouvrez     tudiant    plus #revuehmisphr   master
## 9      tudiant        cole    haut        master     tilt
## 10 @hessovalai  professeur #hes_so       tudiant recherch

## 
## Topic Model Summary for hslu :
##         Topic 1   Topic 2         Topic 3         Topic 4   Topic 5
## 1        luzern      mehr          luzern #hsluinformatik      mehr
## 2         @hslu     @hslu #hsluinformatik           @hslu hochschul
## 3          mehr schweizer            neue            neue     @hslu
## 4        depart    depart          depart           knnen     neuen
## 5     schweizer hochschul           @hslu            gibt     zeigt
## 6            ab      heut         schweiz           zeigt      geht
## 7         kunst     kunst        menschen #hsluwirtschaft interview
## 8  studierenden    design      entwickelt       hochschul     studi
## 9        design      jahr            heut         projekt      jahr
## 10       arbeit   studium            geht           welch    arbeit

## 
## Topic Model Summary for ost_fh :
##                       Topic 1                    Topic 2
## 1  #ostschweizerfachhochschul                        ost
## 2                        neue #ostschweizerfachhochschul
## 3                     @ost_fh                       mehr
## 4                     podcast                 ostschweiz
## 5                    #podcast                    studium
## 6                    kontrast              fachhochschul
## 7                       neuen                   menschen
## 8                kulturzyklus                   @ozg_ost
## 9               #kulturzyklus                         ab
## 10                       zwei                     detail
##                       Topic 3                    Topic 4
## 1                     @ost_fh                    @ost_fh
## 2                    @ozg_ost #ostschweizerfachhochschul
## 3                   st.gallen                        ost
## 4  #ostschweizerfachhochschul                    @ost_wi
## 5                        mehr                 rapperswil
## 6                         ost                   @ozg_ost
## 7                  @eastdigit                       drei
## 8                 #informatik                    septemb
## 9                         neu                       #ost
## 10                   spannend                     campus
##                       Topic 5
## 1                     @ost_fh
## 2  #ostschweizerfachhochschul
## 3                        mehr
## 4                         ost
## 5                        prof
## 6              @insrapperswil
## 7                    herzlich
## 8                     projekt
## 9                   st.gallen
## 10                 ostschweiz

## 
## Topic Model Summary for supsi_ch :
##         Topic 1      Topic 2      Topic 3    Topic 4   Topic 5
## 1      #supsiev informazioni        supsi iscrizioni     supsi
## 2     @supsi_ch     bachelor     #supsiev      right formazion
## 3        lavoro         info    #supsinew   #supsiev        pi
## 4          info         oggi        manag      innov     nuovo
## 5         nuovo     progetto    formazion     ticino #supsinew
## 6      studenti @usi_univers     settembr       info  progetto
## 7     #supsinew        studi       master       oggi    social
## 8  @usi_univers        apert presentazion     master     apert
## 9         supsi       scopri    tecnologi      arrow   ricerca
## 10     progetto     studenti dipartimento conferenza     corsi

Style Analysis

tweets %>%
  mutate(tweet_length = nchar(full_text)) %>%
  ggplot(aes(x = tweet_length)) +
  geom_histogram() +
  labs(title = "Distribution of Tweet Lengths")
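
The pooled histogram hides differences between accounts. A small sketch comparing length distributions per university (assumes only ggplot2 and the tweets tibble from above):

tweets %>%
  mutate(tweet_length = nchar(full_text)) %>%
  ggplot(aes(x = university, y = tweet_length)) +
  geom_boxplot() +
  labs(title = "Tweet length by university", x = NULL, y = "Characters")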

Sentiment Analysis

# Calculate Sentiment for Supported Languages Only
langs <- c("de", "fr", "it", "en")

tweets_filtered <- tweets %>%
  filter(lang %in% langs)

# Create Function to Get Syuzhet Sentiment
get_syuzhet_sentiment <- function(text, lang) {
  # Check if the language is supported
  # Note: the 'syuzhet' lexicon is English-only; syuzhet honors the language
  # argument only for method = 'nrc', so scores for non-English tweets are rough
  if (lang %in% langs) {
    return(get_sentiment(text, method = "syuzhet", lang = lang))
  } else {
    return(NA) # Return NA for unsupported languages
  }
}

# Calculate Syuzhet Sentiment for each Tweet
tweets_filtered$sentiment <-
  mapply(get_syuzhet_sentiment, tweets_filtered$full_text, tweets_filtered$lang)


plot_data <- tweets_filtered %>%
  group_by(university, month) %>%
  summarize(mean_sentiment_syuzhet = mean(sentiment, na.rm = TRUE))

# Plot Syuzhet Sentiment by all Universities
ggplot(plot_data, aes(
  x = month,
  y = mean_sentiment_syuzhet,
  color = university, group = university
)) +
  geom_line() +
  labs(
    title = "Mean Syuzhet Sentiment Over Time by University",
    y = "Mean Sentiment Score"
  ) +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%Y-%m") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

for (uni in unique(tweets$university)) {
  uni_tweets <- tweets %>%
    filter(university == uni, lang %in% langs)

  uni_tweets$sentiment <-
    mapply(get_syuzhet_sentiment, uni_tweets$full_text, uni_tweets$lang)


  plot_data <- uni_tweets %>%
    group_by(year, month) %>%
    summarize(mean_sentiment = mean(sentiment, na.rm = TRUE))

  # Plot Syuzhet Sentiment Over Time (Per University)
  print(ggplot(plot_data, aes(x = month, y = mean_sentiment)) +
    geom_line(aes(color = as.factor(year))) +
    labs(
      title = paste0("Mean Syuzhet Sentiment Over Time by - ", uni),
      y = "Mean Sentiment Score"
    ) +
    scale_x_datetime(date_breaks = "1 month", date_labels = "%Y-%m") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    facet_wrap(~year, scales = "free_x"))

  # We did not find a way to get per-word sentiment for every language, so we
  # use the full_text_emojis column and analyze only the German tweets here
  uni_tweets_de <- uni_tweets %>%
    filter(lang == "de")
  # Tokenize and Preprocess Words
  uni_words <- uni_tweets_de %>%
    unnest_tokens(word, full_text_emojis) %>%
    anti_join(get_stopwords(language = "de")) %>%
    distinct() %>% # remove duplicated words
    filter(nchar(word) > 3) %>% # remove words with less than 4 characters
    filter(!str_detect(word, "\\d")) # remove numbers

  # Join Sentiment with Words (language specific)
  sentiment_words <- uni_words %>%
    mutate(
      sentiment = get_sentiment(word, method = "syuzhet", lang = "de")
    )

  # Separate positive (score >= 0, i.e. including neutral) and negative words
  positive_words <- sentiment_words %>%
    filter(sentiment >= 0) %>%
    count(word, sort = TRUE) %>%
    rename(freq = n) # Rename 'n' to 'freq'

  negative_words <- sentiment_words %>%
    filter(sentiment < 0) %>%
    count(word, sort = TRUE) %>%
    rename(freq = n) # Rename 'n' to 'freq'

  # Create and display word clouds (print() is required inside a loop)
  # positive words
  print(wordcloud2(data.frame(
    word = positive_words$word,
    freq = positive_words$freq
  ), size = 0.5))
  # negative words
  print(wordcloud2(data.frame(
    word = negative_words$word,
    freq = negative_words$freq
  ), size = 0.5))
}

Question 4: What specific advice can you give us as the communication department of BFH based on your analysis? How can we integrate the analysis of tweets into our internal processes, and can you think of any data products that would be of value for us?

Summary of key insights from the analysis

# Language Analysis
# Note: tweets is still grouped by university (from the interval calculation
# above), so count(lang) returns counts per university and language
tweets %>%
  count(lang) %>%
  arrange(desc(n))
## # A tibble: 127 × 3
## # Groups:   university [8]
##    university     lang      n
##    <chr>          <chr> <int>
##  1 hslu           de     2902
##  2 bfh            de     2760
##  3 ZHAW           de     2712
##  4 FHNW           de     2353
##  5 FH_Graubuenden de     2245
##  6 supsi_ch       it     1786
##  7 hes_so         fr     1663
##  8 FH_Graubuenden <NA>    374
##  9 ost_fh         de      248
## 10 ZHAW           <NA>    225
## # ℹ 117 more rows
# Emoji Analysis
emoji_count <- tweets %>%
  top_n_emojis(full_text)

emoji_count %>%
  mutate(emoji_name = reorder(emoji_name, n)) %>%
  ggplot(aes(n, emoji_name)) +
  geom_col() +
  labs(x = "Count", y = NULL, title = "Top 20 Emojis Used")

# Collect the key result objects in one place for the write-up
insights <- list(
  "Most Active Hours" = hours_with_most_tweets_by_uni,
  "Most Active Days" = days_with_most_tweets_by_uni,
  "Content Analysis" = head(words_freqs_de),
  "Sentiment Analysis" = head(tweets_filtered$sentiment)
)

Recommendations:

1. Optimize tweet release times based on peak engagement hours (see the sketch after this list).

2. Focus on specific days with high activity for important announcements.

3. Utilize sentiment analysis to tailor content that resonates positively with the audience.

4. Implement topic modeling to identify key themes and align communication strategy accordingly.
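
As a concrete starting point for recommendations 1 and 2, here is a sketch of a small reusable data product: a helper that ranks posting slots by average engagement. The function name best_engagement_slots is ours, and it assumes only the columns created in this analysis; run regularly, it could feed a simple posting-calendar dashboard for the communication team.

# Rank day/hour slots by mean weighted engagement per university
best_engagement_slots <- function(tweets, top_n = 3) {
  tweets %>%
    ungroup() %>%
    group_by(university, day, timeofday_hour) %>%
    summarize(mean_engagement = mean(weighted_engagement), .groups = "drop") %>%
    group_by(university) %>%
    slice_max(mean_engagement, n = top_n) %>%
    ungroup()
}
# Example: the three most engaging slots for BFH
best_engagement_slots(tweets) %>% filter(university == "bfh")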